## Working directory and data load --------------------------------------------
getwd()
## [1] "/Users/pankaj/dev/git/smu/qtw/case_study3"
# NOTE(review): an absolute setwd() makes the script machine-specific; kept to
# reproduce the original run, but prefer project-relative paths (e.g. here::here).
setwd("/Users/pankaj/dev/git/smu/qtw/case_study3/Week_5_Materials_2")
load("data.Rda")   # loads `emailDFrp` into the global environment
length(emailDFrp)  # a data.frame is a list of columns, so this is the column count
## [1] 30
# Missing values per column, plotted against column index.
plot(colSums(is.na(emailDFrp)))
# Work on a copy and add a row id (one per observation).
df1 <- emailDFrp
df1$id <- seq_len(nrow(df1))
## Exploratory plots -----------------------------------------------------------
# Helper: stacked bar chart of counts for one variable, filled by spam status.
plot_bar_by_spam <- function(data, var) {
  ggplot(data, mapping = aes(x = .data[[var]], fill = isSpam)) +
    geom_bar()
}
# Helper: histogram (fixed-width bins) for a numeric variable, filled by spam status.
plot_hist_by_spam <- function(data, var, binwidth) {
  ggplot(data, mapping = aes(x = .data[[var]], fill = isSpam)) +
    geom_histogram(binwidth = binwidth)
}
# Helper: frequency polygon for a numeric variable, colored by spam status.
plot_freqpoly_by_spam <- function(data, var, binwidth) {
  ggplot(data, mapping = aes(x = .data[[var]], color = isSpam)) +
    geom_freqpoly(binwidth = binwidth)
}

# Categorical / small-count variables: one bar chart each (same plots as before,
# previously written out as eighteen copy-pasted pipelines).
bar_vars <- c("isRe", "underscore", "priority", "isInReplyTo", "sortedRec",
              "subPunc", "multipartText", "hasImages", "isPGPsigned",
              "subSpamWords", "noHost", "numEnd", "isYelling", "isOrigMsg",
              "isDear", "isWrote", "subExcCt", "subQuesCt")
for (v in bar_vars) {
  print(plot_bar_by_spam(df1, v))
}
## Note: subExcCt and subQuesCt each contain 20 NA rows, which ggplot drops
## with a "Removed 20 rows containing non-finite values" warning.

# BUG FIX: the original used geom_bar(binwidth = 100); geom_bar() no longer has
# a binwidth parameter (it warned and ignored it). geom_histogram(binwidth = 100)
# is the intended plot for these count variables.
print(plot_hist_by_spam(df1, "numLines", 100))
print(plot_hist_by_spam(df1, "numDlr", 100))
print(plot_bar_by_spam(df1, "numAtt"))
print(plot_hist_by_spam(df1, "numRec", 100))
## Note: numRec contains 282 NA rows, dropped by ggplot with a warning.
print(plot_bar_by_spam(df1, "hour"))
print(plot_bar_by_spam(df1, "perHTML"))

# Continuous variables: frequency polygons by spam status.
print(plot_freqpoly_by_spam(df1, "forwards", 1))
print(plot_freqpoly_by_spam(df1, "perCaps", 1))
print(plot_freqpoly_by_spam(df1, "bodyCharCt", 100))
print(plot_freqpoly_by_spam(df1, "avgWordLen", 1))
print(plot_freqpoly_by_spam(df1, "subBlanks", 1))
## Note: subBlanks contains 20 NA rows, dropped by ggplot with a warning.
## Confusion Matrix and Statistics
##
## pred.rpint1
## F T
## F 3489 3
## T 1122 60
##
## Accuracy : 0.7593
## 95% CI : (0.7468, 0.7715)
## No Information Rate : 0.9865
## P-Value [Acc > NIR] : 1
##
## Kappa : 0.0727
##
## Mcnemar's Test P-Value : <2e-16
##
## Sensitivity : 0.75667
## Specificity : 0.95238
## Pos Pred Value : 0.99914
## Neg Pred Value : 0.05076
## Prevalence : 0.98652
## Detection Rate : 0.74647
## Detection Prevalence : 0.74711
## Balanced Accuracy : 0.85452
##
## 'Positive' Class : F
##
## Tuned CART via caret --------------------------------------------------------
# Repeated 10-fold cross-validation (3 repeats); model selection on ROC AUC,
# which requires class probabilities and the twoClassSummary function.
train.control <- trainControl(
  method = "repeatedcv",
  number = 10,                        # 10-fold CV
  repeats = 3,                        # repeated three times
  summaryFunction = twoClassSummary,  # report ROC/Sens/Spec (AUC-based selection)
  classProbs = TRUE
)
# Tuning grid over maximum tree depth for method "rpart2" (depth-tuned CART).
tune.gridcart <- expand.grid(maxdepth = 2:15)
#tune.gridcart <- expand.grid(cp = seq(2,10))
system.time(rpartFit2 <- train(isSpam ~ .,
                               data = emailDFrp,
                               method = "rpart2",
                               tuneGrid = tune.gridcart,
                               trControl = train.control,
                               metric = "ROC",
                               na.action = na.pass))
## user system elapsed
## 31.103 1.376 13.555
rpartFit2
## CART
##
## 9348 samples
## 29 predictor
## 2 classes: 'F', 'T'
##
## No pre-processing
## Resampling: Cross-Validated (10 fold, repeated 3 times)
## Summary of sample sizes: 8413, 8414, 8414, 8413, 8414, 8412, ...
## Resampling results across tuning parameters:
##
## maxdepth ROC Sens Spec
## 2 0.7585786 0.9246642 0.5878144
## 3 0.7761748 0.9501753 0.5849152
## 4 0.7789992 0.9683019 0.5762930
## 5 0.8001320 0.9705070 0.5967329
## 6 0.9161483 0.9513262 0.7352371
## 7 0.9161483 0.9513262 0.7352371
## 8 0.9164148 0.9510385 0.7366318
## 9 0.9177516 0.9530529 0.7377586
## 10 0.9187233 0.9554506 0.7377586
## 11 0.9255864 0.9471549 0.7707078
## 12 0.9289536 0.9450444 0.7860094
## 13 0.9348707 0.9428388 0.8093579
## 14 0.9363542 0.9450928 0.8128382
## 15 0.9371638 0.9479696 0.8149262
##
## ROC was used to select the optimal model using the largest value.
## The final value used for the model was maxdepth = 15.
plot(rpartFit2)
# From the grid search above, the ROC AUC increases with maximum depth, but the
# gains level off after maxdepth = 6, so we choose maxdepth = 6 for the final model.
## Final rpart model at maxdepth = 6 -------------------------------------------
# BUG FIX: the original passed cp/minsplit/minbucket/maxsurrogate through `...`
# AND supplied control = rpart.control(maxdepth = 6). rpart() builds a control
# list from `...` and then overwrites it with an explicitly supplied `control`,
# so those tuning values were silently reset to the rpart.control() defaults.
# All control parameters are now consolidated into a single rpart.control().
emailDFrp.rp <- rpart(isSpam ~ .,                            # formula
                      data = emailDFrp,                      # data frame
                      subset = train,                        # training-set indices
                                                             # (defined earlier — TODO confirm)
                      method = "class",                      # classification tree
                      parms = list(split = "information"),   # entropy/deviance splits
                      control = rpart.control(
                        maxdepth = 6,     # depth chosen from the grid search above
                        cp = 0,           # no size penalty
                        minsplit = 5,     # nodes of size >= 5 can be split,
                        minbucket = 2,    # provided each sub-node has >= 2 obs
                        maxsurrogate = 0  # no surrogates (no missing values used)
                      ))
#summary(emailDFrp.rp)
# Base-graphics rendering of the tree, then a prettier rattle version.
plot(emailDFrp.rp,
     uniform = TRUE,
     compress = TRUE,
     margin = .2)
text(emailDFrp.rp,
     use.n = TRUE,
     all = TRUE,
     fancy = TRUE)
fancyRpartPlot(emailDFrp.rp, main = "Decision Tree Graph")
# Confusion matrix for the model built on all variables.
pred.rpint <- predict(emailDFrp.rp,
                      newdata = emailDFrp[-train, ],  # held-out test rows
                      type = "class")
#
# Classification table on the test data
#
cm <- table(emailDFrp$isSpam[-train], pred.rpint)
confusionMatrix(cm)
## Confusion Matrix and Statistics
##
## pred.rpint
## F T
## F 3387 105
## T 490 692
##
## Accuracy : 0.8727
## 95% CI : (0.8628, 0.8821)
## No Information Rate : 0.8295
## P-Value [Acc > NIR] : < 2.2e-16
##
## Kappa : 0.6224
##
## Mcnemar's Test P-Value : < 2.2e-16
##
## Sensitivity : 0.8736
## Specificity : 0.8683
## Pos Pred Value : 0.9699
## Neg Pred Value : 0.5854
## Prevalence : 0.8295
## Detection Rate : 0.7246
## Detection Prevalence : 0.7471
## Balanced Accuracy : 0.8709
##
## 'Positive' Class : F
##
# Variable importance scores from the fitted rpart tree (larger = more important).
emailDFrp.rp$variable.importance
## forwards isInReplyTo isRe perCaps isWrote bodyCharCt
## 535.602402 365.401656 323.694079 309.072712 264.207776 214.666202
## numLines subBlanks subExcCt numRec isOrigMsg avgWordLen
## 170.442266 121.321644 74.526799 38.481899 32.934959 22.330894
## numDlr perHTML priority isPGPsigned subQuesCt
## 9.908102 5.585596 4.718144 4.303511 4.140931
# Next section: H2O random forest — the train/validation split happens below
# via h2o.splitFrame().
library(data.tree)
# Convert an H2OTree object into a data.tree structure suitable for plotting.
# The root node is named after the root split feature; children are attached
# recursively by addChildren(). Returns the root data.tree::Node.
createDataTree <- function(h2oTree) {
  h2oRoot <- h2oTree@root_node
  dataTree <- Node$new(h2oRoot@split_feature)
  dataTree$type <- "split"  # the root is always a split node
  addChildren(dataTree, h2oRoot)
  dataTree
}
# Recursively attach the children of an H2O split node to a data.tree node.
# Leaf children become "prediction: <value>" nodes; split children are named by
# their split feature and recursed into. Edge labels describe the split
# condition (level sets for categorical splits, threshold comparisons for
# numeric ones, with NA routing appended). The TRUE/FALSE return values are
# ignored by callers; recursion simply stops at non-split nodes.
addChildren <- function(dtree, node) {
  if (class(node)[1] != "H2OSplitNode") return(TRUE)
  na_direction <- node@na_direction
  if (is.na(node@threshold)) {
    # Categorical split: no threshold, label edges with the factor-level sets.
    leftEdgeLabel <- printValues(node@left_levels,
                                 na_direction == "LEFT", 4)
    rightEdgeLabel <- printValues(node@right_levels,
                                  na_direction == "RIGHT", 4)
  } else {
    # Numeric split: label edges with the threshold comparison.
    leftEdgeLabel <- paste("<", node@threshold,
                           if (na_direction == "LEFT") ",NA" else "")
    rightEdgeLabel <- paste(">=", node@threshold,
                            if (na_direction == "RIGHT") ",NA" else "")
  }
  left_node <- node@left_child
  right_node <- node@right_child
  leftLabel <- if (class(left_node)[[1]] == "H2OLeafNode") {
    paste("prediction:", left_node@prediction)
  } else {
    left_node@split_feature
  }
  rightLabel <- if (class(right_node)[[1]] == "H2OLeafNode") {
    paste("prediction:", right_node@prediction)
  } else {
    right_node@split_feature
  }
  # data.tree identifies children by name, so disambiguate identical labels.
  if (leftLabel == rightLabel) {
    leftLabel <- paste(leftLabel, "(L)")
    rightLabel <- paste(rightLabel, "(R)")
  }
  dtreeLeft <- dtree$AddChild(leftLabel)
  dtreeLeft$edgeLabel <- leftEdgeLabel
  dtreeLeft$type <- if (class(left_node)[1] == "H2OSplitNode") "split" else "leaf"
  dtreeRight <- dtree$AddChild(rightLabel)
  dtreeRight$edgeLabel <- rightEdgeLabel
  dtreeRight$type <- if (class(right_node)[1] == "H2OSplitNode") "split" else "leaf"
  addChildren(dtreeLeft, left_node)
  addChildren(dtreeRight, right_node)
  FALSE
}
# Format a vector of categorical levels for a tree edge label.
# Shows at most `n` values (comma-separated), appends ",..." when truncated,
# and appends NA when missing values are routed down this edge.
# Returns a single string ("" or "NA" when `values` is empty).
printValues <- function(values, is_na_direction, n = 4) {
  l <- length(values)
  if (l == 0) {
    return(if (is_na_direction) "NA" else "")
  }
  value_string <- paste0(values[seq_len(min(n, l))], collapse = ", ")
  if (l > n) value_string <- paste0(value_string, ",...")
  if (is_na_direction) value_string <- paste0(value_string, ", NA")
  value_string
}
# Build and train the model:
# Start (or connect to) a local H2O cluster.
h2o.init()
## Connection successful!
##
## R is connected to the H2O cluster:
## H2O cluster uptime: 1 days 6 hours
## H2O cluster timezone: Asia/Singapore
## H2O data parsing timezone: UTC
## H2O cluster version: 3.30.0.1
## H2O cluster version age: 2 months and 13 days
## H2O cluster name: H2O_started_from_R_pankaj_qto124
## H2O cluster total nodes: 1
## H2O cluster total memory: 3.26 GB
## H2O cluster total cores: 8
## H2O cluster allowed cores: 8
## H2O cluster healthy: TRUE
## H2O Connection ip: localhost
## H2O Connection port: 54321
## H2O Connection proxy: NA
## H2O Internal Security: FALSE
## H2O API Extensions: Amazon S3, XGBoost, Algos, AutoML, Core V3, TargetEncoder, Core V4
## R Version: R version 3.6.0 (2019-04-26)
# Upload the data frame to the H2O cluster.
emailDHex <- as.h2o(emailDFrp)
##
|
| | 0%
|
|=================================================================| 100%
# 80/20 train/validation split (seeded for reproducibility).
splits <- h2o.splitFrame(data = emailDHex, ratios = .8, seed = 1234)
trainHex <- splits[[1]]
validHex <- splits[[2]]
# Predictors are columns 2..29 of the data frame; response is isSpam.
# (The original had a dead `predictors = c("isRe")` that was immediately
# overwritten — removed.)
predictors <- names(emailDFrp)[2:29]
response <- "isSpam"
# Small random forest: 5 trees of depth <= 5, with double trees for the
# binomial response; validated against the held-out frame.
emailD_2tree <- h2o.randomForest(x = predictors,
                                 y = response,
                                 ntrees = 5,
                                 max_depth = 5,
                                 min_rows = 20,
                                 binomial_double_trees = TRUE,
                                 training_frame = trainHex,
                                 validation_frame = validHex)
##
|
| | 0%
|
|========================== | 40%
|
|=================================================================| 100%
# Pull one individual tree out of the fitted forest for visualisation.
emailDH2oTree2 <- h2o.getModelTree(model = emailD_2tree, tree_number = 2)
library(data.tree)   # already attached earlier; re-attaching is harmless
library(DiagrammeR)
emailDataTree <- createDataTree(emailDH2oTree2)
# Styling callbacks used by data.tree's plot(): edge labels from the split
# conditions, diamond nodes for splits, ovals for leaves.
GetEdgeLabel <- function(node) node$edgeLabel
GetNodeShape <- function(node) switch(node$type, split = "diamond", leaf = "oval")
GetFontName <- function(node) switch(node$type,
                                     split = "Palatino-bold",
                                     leaf = "Palatino")
SetEdgeStyle(emailDataTree, fontname = "Palatino-italic",
             label = GetEdgeLabel, labelfloat = TRUE,
             fontsize = "26", fontcolor = "royalblue4")
SetNodeStyle(emailDataTree, fontname = GetFontName, shape = GetNodeShape,
             fontsize = "26", fontcolor = "royalblue4",
             height = "0.75", width = "1")
SetGraphStyle(emailDataTree, rankdir = "LR", dpi = 70.)
plot(emailDataTree, output = "graph")
# Print the text rendering of the extracted tree.
emailDataTree
## levelName
## 1 bodyCharCt
## 2 ¦--isRe
## 3 ¦ ¦--numEnd
## 4 ¦ ¦ ¦--sortedRec
## 5 ¦ ¦ ¦ ¦--prediction: 0
## 6 ¦ ¦ ¦ °--perCaps
## 7 ¦ ¦ ¦ ¦--prediction: 0.5
## 8 ¦ ¦ ¦ °--prediction: 0.07070707
## 9 ¦ ¦ °--perHTML
## 10 ¦ ¦ ¦--hour
## 11 ¦ ¦ ¦ ¦--prediction: 0.89816934
## 12 ¦ ¦ ¦ °--prediction: 0.733871
## 13 ¦ ¦ °--prediction: 0
## 14 ¦ °--isInReplyTo
## 15 ¦ ¦--subBlanks
## 16 ¦ ¦ ¦--hour (L)
## 17 ¦ ¦ ¦ ¦--prediction: 0.95348835
## 18 ¦ ¦ ¦ °--prediction: 1
## 19 ¦ ¦ °--hour (R)
## 20 ¦ ¦ ¦--prediction: 1
## 21 ¦ ¦ °--prediction: 0.95
## 22 ¦ °--prediction: 1
## 23 °--numRec
## 24 ¦--isInReplyTo
## 25 ¦ ¦--perCaps
## 26 ¦ ¦ ¦--numRec
## 27 ¦ ¦ ¦ ¦--prediction: 0.6415094
## 28 ¦ ¦ ¦ °--prediction: 0.8785047
## 29 ¦ ¦ °--bodyCharCt
## 30 ¦ ¦ ¦--prediction: 0.11678832
## 31 ¦ ¦ °--prediction: 0.82857144
## 32 ¦ °--prediction: 1
## 33 °--prediction: 0
# Shut down the local H2O cluster (interactively prompts for confirmation).
h2o.shutdown()
## Are you sure you want to shutdown the H2O instance running at http://localhost:54321/ (Y/N)?